# steps 4,5, 6 use euclidean distance
library(plotly)
library(seriation)

Question 1.

# Question 1 data preparation: retain only the columns of interest, i.e.
# positions 1, 2, 5-7, 9, 10 and 16-19 of prices_earnings.
kept_columns <- c(1, 2, 5:7, 9, 10, 16:19)
p_e <- prices_earnings[, kept_columns]

# Label every row with the value held in the first retained column.
rownames(p_e) <- p_e[[1]]

Question 2.

Without any reordering, we cannot identify any clusters or outliers.

# Heatmap of p_e_sc in its original row/column order (no seriation applied).
# plot_ly() is called without a data argument (the former `p_e_sc %>%` pipe
# is disabled); the formulas refer to the p_e_sc object directly.
layout(
  plot_ly(
    x = ~colnames(p_e_sc),
    y = ~rownames(p_e_sc),
    z = ~p_e_sc,
    type = "heatmap",
    colors = colorRamp(c("black", "red"))
  ),
  title = "Heatmap of prices and earnings",
  xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
  yaxis = list(title = "Cities", zeroline = FALSE)
)

Question 3.

# Seriation permutes both rows and columns, so one Euclidean distance object
# is built over the rows of p_e_sc and another over its columns.
p_e_rdist <- dist(p_e_sc, method = "euclidean")
p_e_cdist <- dist(t(p_e_sc), method = "euclidean")

# Seed for reproducibility (currently disabled):
# set.seed(1011)

# Row and column orders returned by the "OLO" seriation method.
order1 <- p_e_rdist %>%
  seriate(method = "OLO") %>%
  get_order()
order2 <- p_e_cdist %>%
  seriate(method = "OLO") %>%
  get_order()

# Apply both permutations; the row order is reversed before plotting.
p_reord <- p_e_sc[rev(order1), order2]

layout(
  plot_ly(
    x = ~colnames(p_reord),
    y = ~rownames(p_reord),
    z = ~p_reord,
    type = "heatmap",
    colors = colorRamp(c("black", "red"))
  ),
  title = "Heatmap of prices and earnings (Euclid dist - HC)",
  xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
  yaxis = list(title = "Cities", zeroline = FALSE)
)
# Correlation-based dissimilarity, computed as (1 - correlation) / 2.
# cor() correlates the columns of its argument, so p_e_cor holds the
# column-wise dissimilarities and p_e_cor1 (on the transpose) the row-wise
# ones.
p_e_cor <- as.dist((1 - cor(p_e_sc)) / 2)
p_e_cor1 <- as.dist((1 - cor(t(p_e_sc))) / 2)

# Seed for reproducibility (currently disabled):
# set.seed(1212)

# "OLO" seriation orders: ord1 for the columns, ord2 for the rows.
ord1 <- p_e_cor %>%
  seriate(method = "OLO") %>%
  get_order()
ord2 <- p_e_cor1 %>%
  seriate(method = "OLO") %>%
  get_order()

# Reorder the data; the row order is reversed before plotting.
p_reord2 <- p_e_sc[rev(ord2), ord1]

layout(
  plot_ly(
    x = ~colnames(p_reord2),
    y = ~rownames(p_reord2),
    z = ~p_reord2,
    type = "heatmap",
    colors = colorRamp(c("black", "red"))
  ),
  title = "Heatmap of prices and earnings (Cor dist)",
  xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
  yaxis = list(title = "Cities", zeroline = FALSE)
)

Ordering by Euclidean distance produces a heat map that is easier to analyze. At first glance we can perceive four general regions belonging to two groups. In the first group the heat map colors tend towards a brighter shade of red, while in the second group they tend towards a darker shade of red/black. Although these groups can also be seen in the correlation-distance heat map, they are not as clear there as in the first.

Based on the Euclidean-distance heat map, net wage tends towards higher values from Dubai onwards, while the number of hours worked decreases. This is the opposite of cities like Delhi, Bangkok and Seoul. Interestingly, food costs are generally low in the cities with higher working hours. Caracas is an outlier because its food costs are high while net wage and the number of hours worked remain low.

Question 4.

# Question 4: reorder with the "TSP" seriation method, using the Euclidean
# distance objects p_e_rdist (rows) and p_e_cdist (columns).
#
# The TSP solver is a randomised heuristic, so without a fixed seed the
# ordering (and therefore the heatmap) can change from run to run. The seed
# is set right before the TSP calls to make the result reproducible.
set.seed(1011)
ord_q4_1 <- get_order(seriate(p_e_rdist, method = "TSP"))
order_q4_2 <- get_order(seriate(p_e_cdist, method = "TSP"))

# Apply both permutations; the row order is reversed before plotting.
p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]

plot_ly(
  x = ~colnames(p_reord_q4),
  y = ~rownames(p_reord_q4),
  z = ~p_reord_q4,
  type = "heatmap",
  colors = colorRamp(c("black", "red"))
) %>%
  layout(
    title = "Heatmap of prices and earnings (Euclid dist- TSP)",
    xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
    yaxis = list(title = "Cities", zeroline = FALSE)
  )
# Compare seriation quality with criterion(): the row distances p_e_rdist are
# scored once in their original order and once in the order returned by each
# solver ("OLO" -> or1, "TSP" -> or2).
or1 <- seriate(p_e_rdist, method = "OLO")

# The TSP solver is a randomised heuristic; fix the seed immediately before
# the call so that the criterion values reported for it are reproducible.
set.seed(1011)
or2 <- seriate(p_e_rdist, method = "TSP")

# The unordered baseline is identical for both comparisons: compute it once.
crit_unordered <- criterion(p_e_rdist)

result1 <- rbind(
  unordered = crit_unordered,
  ordered = criterion(p_e_rdist, or1)
)
result2 <- rbind(
  unordered = crit_unordered,
  ordered = criterion(p_e_rdist, or2)
)

# NOTE(review): the tables below are output pasted from an earlier run;
# presumably that run was unseeded, so the TSP figures under result2 may
# differ after a re-run -- refresh them when the report is regenerated.
result1
##                2SUM AR_deviations AR_events      BAR      Cor_R
## unordered 1012004.5     107139.67     61656 29259.38 0.04268063
## ordered    756120.2      20296.15     26876 19055.56 0.20713342
##           Gradient_raw Gradient_weighted  Inertia Lazy_path_length
## unordered        -4032         -11081.08 17886336        10126.431
## ordered          65528         156151.06 24666913         3713.804
##           Least_squares        LS       ME Moore_stress Neumann_stress
## unordered       3575435 1006126.8 568.2673     986.9925       553.9889
## ordered         3352459  894638.7 652.4429     411.5392       239.0233
##           Path_length      RGAR
## unordered    281.7269 0.5169014
## ordered      121.9671 0.2253186
result2
##                2SUM AR_deviations AR_events      BAR      Cor_R
## unordered 1012004.5     107139.67     61656 29259.38 0.04268063
## ordered    857268.5      50908.36     42881 20078.19 0.10682694
##           Gradient_raw Gradient_weighted  Inertia Lazy_path_length
## unordered        -4032         -11081.08 17886336        10126.431
## ordered          33518          87988.19 21204457         4135.212
##           Least_squares        LS       ME Moore_stress Neumann_stress
## unordered       3575435 1006126.8 568.2673     986.9925       553.9889
## ordered         3443342  940080.6 651.0846     420.4959       242.1392
##           Path_length      RGAR
## unordered    281.7269 0.5169014
## ordered      121.4917 0.3594987

The TSP solver achieves a shorter path length than the HC solver (121.49 vs. 121.97 in the output above).

Question 5.

# Parallel coordinates plot of the scaled data in its original (unsorted)
# order. p_e_sc is converted to a data frame and rounded to one decimal place
# before plotting.
p_e_sc2 <- round(as.data.frame(p_e_sc), 1)

plot_ly(
  p_e_sc2,
  type = "parcoords",
  dimensions = list(
    list(
      label = "Food.Costs...",
      values = ~Food.Costs...
    ),
    list(
      label = "iPhone.4S.hr.",
      values = ~iPhone.4S.hr.
    ),
    list(
      label = "Clothing.Index",
      values = ~Clothing.Index
    ),
    list(
      label = "Hours.Worked",
      values = ~Hours.Worked
    ),
    list(
      label = "Wage.Net",
      values = ~Wage.Net
    ),
    list(
      label = "Vacation.Days",
      values = ~Vacation.Days
    ),
    list(
      label = "Big.Mac.min.",
      values = ~Big.Mac.min.
    ),
    list(
      label = "Bread.kg.in.min.",
      values = ~Bread.kg.in.min.
    ),
    list(
      label = "Rice.kg.in.min.",
      values = ~Rice.kg.in.min.
    ),
    list(
      label = "Goods.and.Services...",
      values = ~Goods.and.Services...
    )
  )
)
# Add a numeric 0/1 cluster indicator derived from the iPhone column:
# 0 where iPhone.4S.hr. is below -0.5, 1 otherwise. It drives the line colour
# of the plot below (0 -> "red", 1 -> "blue").
p_e_sc2[["clust"]] <- ifelse(p_e_sc2[["iPhone.4S.hr."]] < -0.5, 0, 1)

plot_ly(
  p_e_sc2,
  type = "parcoords",
  line = list(
    color = ~clust,
    colorscale = list(c(0, "red"), c(1, "blue"))
  ),
  dimensions = list(
    list(
      label = "Food.Costs...",
      values = ~Food.Costs...
    ),
    list(
      label = "iPhone.4S.hr.",
      values = ~iPhone.4S.hr.
    ),
    list(
      label = "Clothing.Index",
      values = ~Clothing.Index
    ),
    list(
      label = "Hours.Worked",
      values = ~Hours.Worked
    ),
    list(
      label = "Wage.Net",
      values = ~Wage.Net
    ),
    list(
      label = "Vacation.Days",
      values = ~Vacation.Days
    ),
    list(
      label = "Big.Mac.min.",
      values = ~Big.Mac.min.
    ),
    list(
      label = "Bread.kg.in.min.",
      values = ~Bread.kg.in.min.
    ),
    list(
      label = "Rice.kg.in.min.",
      values = ~Rice.kg.in.min.
    ),
    list(
      label = "Goods.and.Services...",
      values = ~Goods.and.Services...
    )
  )
)

We can identify two clusters, characterized by Wage.Net (blue) and iPhone.4S.hr. (red). In the red cluster (defined by iPhone.4S.hr.), Wage.Net takes values greater than 0, while in the blue cluster iPhone.4S.hr. takes values greater than -0.5.